#Importing all required library
import nltk
import re
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import word_tokenize, sent_tokenize
from wordcloud import WordCloud, STOPWORDS
from tqdm import tqdm
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
[nltk_data] Downloading package punkt to C:\Users\Amrendra [nltk_data] Mishra\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package stopwords to C:\Users\Amrendra [nltk_data] Mishra\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package wordnet to C:\Users\Amrendra [nltk_data] Mishra\AppData\Roaming\nltk_data... [nltk_data] Package wordnet is already up-to-date!
1.importing 1st dataset 2.EDA on 1st dataset 3.Data pre-processing
Dataset1 = pd.read_csv("newss.csv")
import pandas as pd
import numpy as np
Dataset1.head()
| Unnamed: 0 | title | text | label | |
|---|---|---|---|---|
| 0 | 8476 | You Can Smell Hillary’s Fear | Daniel Greenfield, a Shillman Journalism Fello... | FAKE |
| 1 | 10294 | Watch The Exact Moment Paul Ryan Committed Pol... | Google Pinterest Digg Linkedin Reddit Stumbleu... | FAKE |
| 2 | 3608 | Kerry to go to Paris in gesture of sympathy | U.S. Secretary of State John F. Kerry said Mon... | REAL |
| 3 | 10142 | Bernie supporters on Twitter erupt in anger ag... | — Kaydee King (@KaydeeKing) November 9, 2016 T... | FAKE |
| 4 | 875 | The Battle of New York: Why This Primary Matters | It's primary day in New York and front-runners... | REAL |
Dataset1.tail()
| Unnamed: 0 | title | text | label | |
|---|---|---|---|---|
| 6330 | 4490 | State Department says it can't find emails fro... | The State Department told the Republican Natio... | REAL |
| 6331 | 8062 | The ‘P’ in PBS Should Stand for ‘Plutocratic’ ... | The ‘P’ in PBS Should Stand for ‘Plutocratic’ ... | FAKE |
| 6332 | 8622 | Anti-Trump Protesters Are Tools of the Oligarc... | Anti-Trump Protesters Are Tools of the Oligar... | FAKE |
| 6333 | 4021 | In Ethiopia, Obama seeks progress on peace, se... | ADDIS ABABA, Ethiopia —President Obama convene... | REAL |
| 6334 | 4330 | Jeb Bush Is Suddenly Attacking Trump. Here's W... | Jeb Bush Is Suddenly Attacking Trump. Here's W... | REAL |
Dataset1.nunique()
Unnamed: 0 6335 title 6256 text 6060 label 2 dtype: int64
Dataset1["Article"] = Dataset1["title"] + Dataset1["text"]
Dataset1.sample(frac = 1) #Shuffle 100%
Dataset1.label[Dataset1.label == 'REAL'] = 1
Dataset1.label[Dataset1.label == 'FAKE'] = 0
Dataset1 = Dataset1.loc[:,['Article','label']]
Dataset1 = Dataset1.dropna()
<ipython-input-6-586347da7020>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy Dataset1.label[Dataset1.label == 'REAL'] = 1 <ipython-input-6-586347da7020>:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy Dataset1.label[Dataset1.label == 'FAKE'] = 0
In this step we will clean the data that will be used for training. The cleaning will involve these steps- 1.Removing all the extra information like brackets, any kind of puctuations - commas, apostrophes, quotes, question marks, and more. 2.Remove all the numeric text, urls
def wordpre(text):
text = text.lower()
text = re.sub('\[.*?\]', '', text)
text = re.sub("\\W"," ",text) # remove special chars
text = re.sub('https?://\S+|www\.\S+', '', text)
text = re.sub('<.*?>+', '', text)
text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
text = re.sub('\n', '', text)
text = re.sub('\w*\d\w*', '', text)
return text
## Applying the wordpre method to the dataset
Dataset1['Article']=Dataset1['Article'].apply(wordpre)
#word used in Real news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(Dataset1[Dataset1.label== 1].Article))
plt.imshow(wc, interpolation="bilinear")
<matplotlib.image.AxesImage at 0x28458edb460>
#word used in Fake news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(Dataset1[Dataset1.label== 0].Article))
plt.imshow(wc, interpolation="bilinear")
<matplotlib.image.AxesImage at 0x28474209910>
Dataset2_true = pd.read_csv("C:/Users/Amrendra Mishra/Desktop/Fake_news_Detection/Dataset/True.csv")
Dataset2_fake = pd.read_csv("C:/Users/Amrendra Mishra/Desktop/Fake_news_Detection/Dataset/Fake.csv")
Dataset2_true.nunique()
title 20826 text 21192 subject 2 date 716 dtype: int64
Dataset2_fake.nunique()
title 17903 text 17455 subject 6 date 1681 dtype: int64
#Counting by Subjects in Real news
for key,count in Dataset2_true.subject.value_counts().iteritems():
print(f"{key}:\t{count}")
#Getting Total Rows
print(f"Total Records:\t{Dataset2_true.shape[0]}")
politicsNews: 11272 worldnews: 10145 Total Records: 21417
#Counting by Subjects in Fake news
for key,count in Dataset2_fake.subject.value_counts().iteritems():
print(f"{key}:\t{count}")
#Getting Total Rows
print(f"Total Records:\t{Dataset2_fake.shape[0]}")
News: 9050 politics: 6841 left-news: 4459 Government News: 1570 US_News: 783 Middle-east: 778 Total Records: 23481
#ploting the Subjects in Real news
plt.figure(figsize=(8,5))
sns.countplot("subject", data=Dataset2_true)
plt.show()
#ploting the Subjects in Fake news
plt.figure(figsize=(8,5))
sns.countplot("subject", data=Dataset2_fake)
plt.show()
Dataset2_true['label']= 1
Dataset2_fake['label']= 0
Dataset2 = pd.concat([Dataset2_true, Dataset2_fake])
Dataset2["Article"] = Dataset2["title"] + Dataset2["text"]
Dataset2.sample(frac = 1) #Shuffle 100%
Dataset2 = Dataset2.loc[:,['Article','label']]
Dataset2
| Article | label | |
|---|---|---|
| 0 | As U.S. budget fight looms, Republicans flip t... | 1 |
| 1 | U.S. military to accept transgender recruits o... | 1 |
| 2 | Senior U.S. Republican senator: 'Let Mr. Muell... | 1 |
| 3 | FBI Russia probe helped by Australian diplomat... | 1 |
| 4 | Trump wants Postal Service to charge 'much mor... | 1 |
| ... | ... | ... |
| 23476 | McPain: John McCain Furious That Iran Treated ... | 0 |
| 23477 | JUSTICE? Yahoo Settles E-mail Privacy Class-ac... | 0 |
| 23478 | Sunnistan: US and Allied ‘Safe Zone’ Plan to T... | 0 |
| 23479 | How to Blow $700 Million: Al Jazeera America F... | 0 |
| 23480 | 10 U.S. Navy Sailors Held by Iranian Military ... | 0 |
44898 rows × 2 columns
## Applying the wordpre method to the dataset
Dataset2['Article']=Dataset2['Article'].apply(wordpre)
Dataset2.head()
| Article | label | |
|---|---|---|
| 0 | as u s budget fight looms republicans flip t... | 1 |
| 1 | u s military to accept transgender recruits o... | 1 |
| 2 | senior u s republican senator let mr muell... | 1 |
| 3 | fbi russia probe helped by australian diplomat... | 1 |
| 4 | trump wants postal service to charge much mor... | 1 |
#word used in Real news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(Dataset2[Dataset2.label== 1].Article))
plt.imshow(wc, interpolation="bilinear")
<matplotlib.image.AxesImage at 0x28477068040>
#word used in Fake news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(Dataset2[Dataset2.label== 0].Article))
plt.imshow(wc, interpolation="bilinear")
<matplotlib.image.AxesImage at 0x28459128640>
Dataset3_real = pd.read_csv("C:/Users/Ashraf/Desktop/Fake_news_Detection/politifact/politifact_real.csv")
Dataset3_fake = pd.read_csv("C:/Users/Ashraf/Desktop/Fake_news_Detection/politifact/politifact_fake.csv")
Dataset3_real['label']= 1
Dataset3_fake['label']= 0
Dataset3 = pd.concat([Dataset3_real, Dataset3_fake])
Dataset3["Article"] = Dataset3["title"]
Dataset3.sample(frac = 1) #Shuffle 100%
Dataset3 = Dataset3.loc[:,['Article','label']]
Dataset3
| Article | label | |
|---|---|---|
| 0 | National Federation of Independent Business | 1 |
| 1 | comments in Fayetteville NC | 1 |
| 2 | Romney makes pitch, hoping to close deal : Ele... | 1 |
| 3 | Democratic Leaders Say House Democrats Are Uni... | 1 |
| 4 | Budget of the United States Government, FY 2008 | 1 |
| ... | ... | ... |
| 427 | Who is affected by the government shutdown? | 0 |
| 428 | Lindsey Graham Threatens To Convert To Democra... | 0 |
| 429 | ELECTORAL COLLEGE ELECTOR COMMITS SUICIDE TO A... | 0 |
| 430 | Sarah Palin Calls To Boycott Mall Of America B... | 0 |
| 431 | Account Suspended | 0 |
1056 rows × 2 columns
## Applying the wordpre method to the dataset
Dataset3['Article']=Dataset3['Article'].apply(wordpre)
#word used in Real news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(Dataset3[Dataset3.label== 1].Article))
plt.imshow(wc, interpolation="bilinear")
<matplotlib.image.AxesImage at 0x284590c40a0>
#word used in Fake news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(Dataset3[Dataset3.label== 0].Article))
plt.imshow(wc, interpolation="bilinear")
<matplotlib.image.AxesImage at 0x28458da76a0>
Dataset4 = pd.read_csv("C:/Users/Ashraf/Desktop/Fake_news_Detection/Data-set/train.csv")
Dataset4.head()
| id | title | author | text | label | |
|---|---|---|---|---|---|
| 0 | 0 | House Dem Aide: We Didn’t Even See Comey’s Let... | Darrell Lucus | House Dem Aide: We Didn’t Even See Comey’s Let... | 1 |
| 1 | 1 | FLYNN: Hillary Clinton, Big Woman on Campus - ... | Daniel J. Flynn | Ever get the feeling your life circles the rou... | 0 |
| 2 | 2 | Why the Truth Might Get You Fired | Consortiumnews.com | Why the Truth Might Get You Fired October 29, ... | 1 |
| 3 | 3 | 15 Civilians Killed In Single US Airstrike Hav... | Jessica Purkiss | Videos 15 Civilians Killed In Single US Airstr... | 1 |
| 4 | 4 | Iranian woman jailed for fictional unpublished... | Howard Portnoy | Print \nAn Iranian woman has been sentenced to... | 1 |
Dataset4["Article"] = Dataset4["title"] + Dataset4["text"]
Dataset4.sample(frac = 1) #Shuffle 100%
Dataset4 = Dataset4.loc[:,['Article','label']]
Dataset4 = Dataset4.dropna()
## Applying the wordpre method to the dataset
Dataset4['Article']=Dataset4['Article'].apply(wordpre)
#word used in Real news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(Dataset4[Dataset4.label== 1].Article))
plt.imshow(wc, interpolation="bilinear")
<matplotlib.image.AxesImage at 0x284744a6f10>
#word used in Fake news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(Dataset4[Dataset4.label== 0].Article))
plt.imshow(wc, interpolation="bilinear")
<matplotlib.image.AxesImage at 0x28466504490>
Dataset5 = pd.read_csv("C:/Users/Ashraf/Desktop/Fake_news_Detection/data.csv")
Dataset5
| URLs | Headline | Body | Label | |
|---|---|---|---|---|
| 0 | http://www.bbc.com/news/world-us-canada-414191... | Four ways Bob Corker skewered Donald Trump | Image copyright Getty Images\nOn Sunday mornin... | 1 |
| 1 | https://www.reuters.com/article/us-filmfestiva... | Linklater's war veteran comedy speaks to moder... | LONDON (Reuters) - “Last Flag Flying”, a comed... | 1 |
| 2 | https://www.nytimes.com/2017/10/09/us/politics... | Trump’s Fight With Corker Jeopardizes His Legi... | The feud broke into public view last week when... | 1 |
| 3 | https://www.reuters.com/article/us-mexico-oil-... | Egypt's Cheiron wins tie-up with Pemex for Mex... | MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin... | 1 |
| 4 | http://www.cnn.com/videos/cnnmoney/2017/10/08/... | Jason Aldean opens 'SNL' with Vegas tribute | Country singer Jason Aldean, who was performin... | 1 |
| ... | ... | ... | ... | ... |
| 4004 | http://beforeitsnews.com/sports/2017/09/trends... | Trends to Watch | Trends to Watch\n% of readers think this story... | 0 |
| 4005 | http://beforeitsnews.com/u-s-politics/2017/10/... | Trump Jr. Is Soon To Give A 30-Minute Speech F... | Trump Jr. Is Soon To Give A 30-Minute Speech F... | 0 |
| 4006 | https://www.activistpost.com/2017/09/ron-paul-... | Ron Paul on Trump, Anarchism & the AltRight | NaN | 0 |
| 4007 | https://www.reuters.com/article/us-china-pharm... | China to accept overseas trial data in bid to ... | SHANGHAI (Reuters) - China said it plans to ac... | 1 |
| 4008 | http://beforeitsnews.com/u-s-politics/2017/10/... | Vice President Mike Pence Leaves NFL Game Beca... | Vice President Mike Pence Leaves NFL Game Beca... | 0 |
4009 rows × 4 columns
Dataset5["Article"] = Dataset5["Headline"] + Dataset5["Body"]
Dataset5["label"] = Dataset5["Label"]
Dataset5.sample(frac = 1) #Shuffle 100%
Dataset5 = Dataset5.loc[:,['Article','label']]
Dataset5 = Dataset5.dropna()
## Applying the wordpre method to the dataset
Dataset5['Article']=Dataset5['Article'].apply(wordpre)
#word used in Real news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(Dataset5[Dataset5.label== 1].Article))
plt.imshow(wc, interpolation="bilinear")
<matplotlib.image.AxesImage at 0x2846a0e8d90>
#word used in Real news
plt.figure(figsize=(15,15))
wc=WordCloud(max_words=2000, width=1600, height=700, stopwords=STOPWORDS).generate("".join(Dataset5[Dataset5.label== 0].Article))
plt.imshow(wc, interpolation="bilinear")
<matplotlib.image.AxesImage at 0x2846a38c310>
#combining all the datset into one
frames = [Dataset1, Dataset2, Dataset3, Dataset4,Dataset5]
Dataset = pd.concat(frames)
Dataset.shape
(76480, 2)
x_train,x_test,y_train,y_test = train_test_split(Dataset['Article'], Dataset['label'], test_size=0.2, random_state=2020)
x_train.shape
(61184,)
x_test.shape
(15296,)
y_train=y_train.astype('int')
y_test=y_test.astype('int')
#LogisticRegression
pipe = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('model', LogisticRegression())])
Logisticmodel = pipe.fit(x_train, y_train)
prediction = Logisticmodel.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
Logisticmodel_accuracy = round(accuracy_score(y_test, prediction)*100,2)
accuracy: 87.04%
#####DecisionTreeClassifier
pipe = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('model', DecisionTreeClassifier(criterion= 'entropy',
max_depth = 10,
splitter='best',
random_state=2020))])
DecisionTreemodel = pipe.fit(x_train, y_train)
prediction = DecisionTreemodel.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
DecisionTreemodel_accuracy = round(accuracy_score(y_test, prediction)*100,2)
accuracy: 82.07%
#####RandomForestClassifier
pipe = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('model', RandomForestClassifier())])
RandomForestmodel = pipe.fit(x_train, y_train)
prediction = RandomForestmodel.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
RandomForestmodel_accuracy = round(accuracy_score(y_test, prediction)*100,2)
accuracy: 82.49%
#Stochastic Gradient Descent
pipe = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('model', SGDClassifier())])
SGDmodel = pipe.fit(x_train, y_train)
prediction = SGDmodel.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
SDGmodel_accuracy = round(accuracy_score(y_test, prediction)*100,2)
accuracy: 86.23%
#GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
pipe = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('model', GradientBoostingClassifier(loss = 'deviance',
learning_rate = 0.01,
n_estimators = 10,
max_depth = 5,
random_state=55))])
GBCmodel = pipe.fit(x_train, y_train)
prediction = GBCmodel.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
GBCmodel_accuracy = round(accuracy_score(y_test, prediction)*100,2)
accuracy: 80.71%
#########XGBClassifier
from xgboost import XGBClassifier
pipe = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('model', XGBClassifier(loss = 'deviance',
learning_rate = 0.01,
n_estimators = 10,
max_depth = 5,
random_state=2020))])
xgboostmodel = pipe.fit(x_train, y_train)
prediction = xgboostmodel.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
xgboostmodel_accuracy = round(accuracy_score(y_test, prediction)*100,2)
[13:51:37] WARNING: C:\Users\Administrator\workspace\xgboost-win64_release_1.2.0\src\learner.cc:516:
Parameters: { loss } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
accuracy: 80.75%
#######Multinomial Naive Bayes Classifier
pipe = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('model', MultinomialNB())])
MNBCmodel = pipe.fit(x_train, y_train)
prediction = MNBCmodel.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
Multinomial_Naive_Bayes_accuracy = round(accuracy_score(y_test, prediction)*100,2)
accuracy: 78.79%
#############Bernoulli Naive Bayes Classifier
pipe = Pipeline([('vect', CountVectorizer()),
('tfidf', TfidfTransformer()),
('model', BernoulliNB())])
BNBCmodel = pipe.fit(x_train, y_train)
prediction = BNBCmodel.predict(x_test)
print("accuracy: {}%".format(round(accuracy_score(y_test, prediction)*100,2)))
Bernoulli_Naive_Bayes_accuracy = round(accuracy_score(y_test, prediction)*100,2)
accuracy: 76.08%
x = ["SDGmodel_accuracy", "Logisticmodel_accuracy", "GBCmodel_accuracy", "xgboostmodel_accuracy" ,
"DecisionTreemodel_accuracy","RandomForestmodel_accuracy","Multinomial_Naive_Bayes_accuracy",
"Bernoulli_Naive_Bayes_accuracy"]
y = [SDGmodel_accuracy,Logisticmodel_accuracy,GBCmodel_accuracy,xgboostmodel_accuracy,
DecisionTreemodel_accuracy,RandomForestmodel_accuracy,Multinomial_Naive_Bayes_accuracy,
Bernoulli_Naive_Bayes_accuracy]
plt.barh(x, y)
for index, value in enumerate(y):
plt.text(value, index, str(value))
import joblib
# Save the model as a pickle in a file
joblib.dump(Logisticmodel, 'model.pkl')
['model.pkl']